import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore
import warnings
warnings.filterwarnings('ignore')
The data is skimmed through to see which variables are present, their data types, the shape, the column names, mixed data types, missing values, etc.
# Load the vehicle silhouette data and take a first look at it.
dataset = pd.read_csv('vehicle-1.csv')
dataset.head()
dataset.info()
# isna and isnull are aliases -- both answer: are there any missing values?
print(dataset.isna().values.any())
print(dataset.isnull().values.any())
There are missing values in the data that have to be treated.
# Dimensions of the data and the class balance of the target column.
dataset.shape
dataset['class'].value_counts()
Observations -
Data contains some missing values. There are 19 columns with class being the target variable. The data contains sample bias with more samples on cars than buses and vans. All the columns are int/float type except for the target column which is object type.
dataset.describe().T
def univariate_plots(Source):
    """Draw a boxplot and a distribution plot side by side for every
    float64 column of *Source*.

    Parameters
    ----------
    Source : pandas.DataFrame
        Frame whose float64 columns are visualised, one figure per column.
    """
    float_cols = Source.select_dtypes(include=['float64']).columns
    for col in float_cols:
        # One figure per column: boxplot (left) highlights outliers/IQR,
        # distplot (right) shows the shape of the distribution.
        # (Bug fix: the original called plt.Text(...), which builds a Text
        # artist that is never attached to any figure -- a silent no-op.)
        f, axes = plt.subplots(1, 2, figsize=(10, 10))
        sns.boxplot(Source[col], ax=axes[0])
        sns.distplot(Source[col], ax=axes[1])
        plt.subplots_adjust(top=1.5, right=10, left=8, bottom=1)
univariate_plots(dataset)
Observations from the Univariate analysis -
# Pairwise scatter plots of all predictors (target dropped); KDE on the
# diagonal shows each feature's marginal distribution.
temp_df = dataset.drop(['class'], axis='columns')
sns.pairplot(temp_df, diag_kind='kde')
We can observe from the pairplots that many columns are correlated and many columns have long tails, which is an indication of outliers. The degree of correlation will be established in the correlation matrix below.
def EDA_Corr(df):
    """Plot a correlation heatmap of the numeric columns of *df* and
    return a per-feature correlation summary.

    For every numeric column the summary records which other columns are
    strongly positively correlated (coeff >= +0.5), strongly negatively
    correlated (coeff <= -0.5), and effectively independent (strictly
    between the two thresholds).

    Returns
    -------
    list of [feature, n_independent, n_pos, n_neg, pos_cols, neg_cols,
    independent_cols] rows (previously computed but discarded).
    """
    # Restrict to numeric columns: corr() over the object 'class' column
    # raises on modern pandas.
    corr = df.select_dtypes(include=np.number).corr()
    summary = []
    for feature in corr.columns:
        pos = corr.index[corr[feature] >= 0.5].tolist()
        neg = corr.index[corr[feature] <= -0.5].tolist()
        # Consistent symmetric band (the original used -0.6 here, which
        # overlapped the negative bucket between -0.5 and -0.6).
        independent = corr.index[(corr[feature] < 0.5) & (corr[feature] > -0.5)].tolist()
        summary.append([feature, len(independent), len(pos), len(neg), pos, neg, independent])
    fig, ax = plt.subplots(figsize=(20, 7))
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(corr, annot=True, vmin=-1, vmax=1, cmap=cmap, linewidths=0, ax=ax)
    return summary
EDA_Corr(dataset)
From the above correlation matrix we can see that there are many features which are highly correlated. If we observe carefully, scaled_variance.1 and scatter_ratio have a correlation coefficient of 1, and many other features have correlations above 0.9. In these cases we will exclude the columns where the correlation is ±0.9 or above.
So there are 8 such columns:
We can safely remove these columns from the list of predictors and still achieve a very high rate of success
sns.countplot(dataset['class'])
From the above we can see that cars are the most frequent class, followed by buses and then vans.
This is a data preprocessing step that includes :-
# Re-check missing values before treating them.
print(dataset.isna().values.any())
print(dataset.isnull().values.any())
# NOTE(review): indexing a DataFrame with a 2-D boolean array behaves like
# .where(), which keeps the frame's full shape rather than selecting only
# rows with NaNs -- confirm; dataset[dataset.isna().any(axis=1)].shape
# would count the NaN rows directly.
dataset[dataset.isna().values == True].shape
We see that there are a total of 41 rows with missing values in the datasets.
One method can be to simply drop these data points, but can we do better?
The Approach we will follow to fill missing values is that we will replace the nan value with the median value of that column. One additional step that we will take here is that we will be slicing the entire dataset into 3 categorical divisions - car, van and bus. This will allow us to get finer medians for every column
# Impute missing values with the *class-wise* median: the median of each
# column computed separately for car, van and bus rows.
lst = ['car', 'van', 'bus']
for item in lst:
    mask = dataset['class'] == item
    # Rows of this class that contain at least one NaN (the original's
    # 2-D boolean mask acted like .where() and did not count NaN rows).
    n_missing = dataset[mask].isna().any(axis=1).sum()
    print('There were {} missing rows for dataset class type {}'.format(n_missing, item))
    # .loc avoids the chained-assignment trap of dataset[mask] = ...;
    # numeric_only skips the object 'class' column, whose median is
    # undefined on modern pandas.
    dataset.loc[mask] = dataset.loc[mask].fillna(dataset.loc[mask].median(numeric_only=True))
    print()
    n_missing_after = dataset[mask].isna().any(axis=1).sum()
    print('After replacing with median there are {} missing rows for dataset class type {}'.format(n_missing_after, item))
    print('-------------------------------------------------------------------------------------')
print(dataset.isna().values.any())
print(dataset.isnull().values.any())
No missing value in the dataset anymore
# Report upper-fence outliers (Tukey's rule: values above Q3 + 1.5*IQR)
# for every column flagged in the univariate analysis.
# NOTE(review): only the upper fence is checked; low-end outliers are
# deliberately ignored here.
cols_with_outliers = ['radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scaled_variance',
                      'scaled_variance.1', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1']
for col in cols_with_outliers:
    print('For column:', col)
    q1 = np.quantile(dataset[col], 0.25)
    q2 = np.quantile(dataset[col], 0.50)
    q3 = np.quantile(dataset[col], 0.75)
    IQR = q3 - q1
    # Fixed the misspelled "Quartie" labels in the printed report.
    print("Quartile1::", q1)
    print("Quartile2::", q2)
    print("Quartile3::", q3)
    print("Inter Quartile Range::", IQR)
    # Reuse q3 instead of recomputing the 0.75 quantile a second time.
    thresh = q3 + (1.5 * IQR)
    print(col, "above", thresh, "are outliers")
    print('The Outliers in', col, 'column are', dataset[dataset[col] > thresh][col].shape[0])
    print()
# Drop upper-fence outlier rows (> Q3 + 1.5*IQR) for each flagged column.
# NOTE(review): rows are dropped inside the loop, so each subsequent
# column's quantiles are computed on the already-reduced frame -- the
# final row count depends on the column order.
cols_with_outliers = ['radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scaled_variance',
'scaled_variance.1', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1']
for col in cols_with_outliers:
    q1 = np.quantile(dataset[col],0.25)
    q2 = np.quantile(dataset[col],0.50)  # median; not used below
    q3 = np.quantile(dataset[col],0.75)
    IQR = q3-q1
    thresh = dataset[col].quantile(0.75)+(1.5 * IQR)
    dataset.drop(dataset[dataset[col]>thresh].index, axis=0, inplace=True)
#display the shape of data frame
print("after fixing outliers shape of dataframe:",dataset.shape)
After fixing the outliers, we have 814 dataset rows.
# Encode the string target as integers. LabelEncoder assigns codes in
# sorted order, hence bus -> 0, car -> 1, van -> 2 (see the note below).
le = LabelEncoder()
dataset['class'] = le.fit_transform(dataset['class'])
dataset['class'].unique()
dataset.head()
Labels -
0 - bus, 1 - car, 2 - van
## Changing the datatype of class to categorical
# Categorical dtype signals that 0/1/2 are labels, not magnitudes.
dataset["class"] = pd.Categorical(dataset["class"])
dataset.info()
Additionally changing the cols 'compactness', 'max.length_aspect_ratio', 'max.length_rectangularity' and 'hollows_ratio' to float for consistency
# Cast the remaining integer columns to float for dtype consistency.
# Bug fix: pd.Float64Index is an Index class (removed in pandas 2.0),
# not a dtype converter -- .astype('float64') is the correct cast.
dataset['compactness'] = dataset['compactness'].astype('float64')
dataset['max.length_aspect_ratio'] = dataset['max.length_aspect_ratio'].astype('float64')
dataset['max.length_rectangularity'] = dataset['max.length_rectangularity'].astype('float64')
dataset['hollows_ratio'] = dataset['hollows_ratio'].astype('float64')
dataset.info()
#Input = dataset.drop()
# Drop one column from each highly-correlated pair (|corr| >= ~0.9)
# identified in the correlation matrix above.
cols_to_drop =['max.length_rectangularity','scaled_radius_of_gyration','skewness_about.2','scatter_ratio','elongatedness',
'pr.axis_rectangularity','scaled_variance','scaled_variance.1']
# NOTE(review): 'input' shadows the Python builtin; renaming it would also
# require updating the train/test-split cell below, so it is left as-is.
input = dataset.drop(cols_to_drop, axis='columns')
input.head()
input.info()
In this step we will split the data into training and testing set. We will keep the training size as 70%.
After splitting, we will also scale the independent variable
# Separate the target from the predictors, then hold out 30% for testing.
y = input['class']
X = input.drop('class', axis='columns')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(X_train.shape)
print(X_test.shape)
# Standardise the features: the scaler is fit on the training split only
# and the same transformation is then applied to the test split.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
X_train
y_train
We will be using kernel as rbf
# RBF-kernel SVM. gamma="auto" is 1/n_features; NOTE(review): degree is
# only used by the polynomial kernel, so it has no effect here.
svm_model = SVC(kernel="rbf", C=30, gamma="auto", degree=3)
svm_model.fit(X_train, y_train)

# Accuracy on both splits to gauge over/under-fitting.
svm_score_rbf_train = svm_model.score(X_train, y_train)
print('SVM model score (Training Data):', svm_score_rbf_train)
svm_score_rbf = svm_model.score(X_test, y_test)
print('SVM model score (Testing Data):', svm_score_rbf)

# Per-class precision/recall/f1 on the held-out test split.
svm_pred = svm_model.predict(X_test)
print("\nSVC - Classification Report")
class_report = classification_report(y_test, svm_pred, labels=[0, 1, 2])
print(class_report)
# Confusion matrix as a heatmap (fmt='g' keeps counts as plain integers).
sns.heatmap(confusion_matrix(y_test,svm_pred), annot=True, cmap='Greens',fmt='g')
# Pull per-class precision/recall out of the report dict for the final
# comparison; the dict keys are the stringified class labels.
rep = classification_report(y_test, svm_pred, labels=[0,1,2], output_dict=True)
prec_bus_svm = rep['0']['precision']
recall_bus_svm = rep['0']['recall']
prec_car_svm = rep['1']['precision']
recall_car_svm = rep['1']['recall']
prec_van_svm = rep['2']['precision']
recall_van_svm = rep['2']['recall']
# 10-fold cross-validation of the SVM on the full (scaled) data.
X1 = X.copy()
y1 = y.copy()
# NOTE(review): the scaler is fit on ALL rows before cross-validation, so
# every fold's test data influenced the scaling -- scores are slightly
# optimistic. A Pipeline(StandardScaler, SVC) would avoid this leakage.
X1 = scale.fit_transform(X1)
X1
## Using the cv = 10
scores = cross_val_score(svm_model, X1, y1, cv = 10)
scores
svm_crossval_score_mean = scores.mean()
print('SVM classifier cross val mean score is:', svm_crossval_score_mean)
svm_crossval_score_std = scores.std()
print('SVM classifier cross val standard deviation is:', svm_crossval_score_std)
# PCA is run on the FULL feature set (no correlated columns dropped) so it
# can redistribute the correlated variance itself.
dataset_pca = dataset.copy()
dataset_pca.head()
X_pca = dataset_pca.drop('class', axis='columns')
y_pca = dataset_pca['class']
# z-score standardisation: PCA is scale-sensitive.
X_pca = X_pca.apply(zscore)
X_pca.head()
covMatrix = np.cov(X_pca,rowvar=False)
print(covMatrix)
# Keep all 18 components first, to inspect the full variance spectrum.
pca = PCA(n_components=18)
pca.fit(X_pca)
X_pca
The Eigen vectors
print(pca.components_)
The Eigen Values
#display explained variance ratio
# Fraction of total variance captured by each component.
pca.explained_variance_ratio_
The percentage of variation explained by each eigen Vector
print(pca.explained_variance_ratio_)
# Scree plot: variance explained by each of the 18 components.
component_ids = list(range(1, 19))
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Cumulative curve -- used to decide how many components to keep.
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cumulative of variation explained')
plt.xlabel('eigen Value')
plt.show()
From the above plots, it can be observed that 6 dimensions are enough to explain over 95% of the variation of the original data
# Re-fit PCA keeping only the 6 components found to explain ~95% of the
# variance, then project the data onto them.
pca6 = PCA(n_components=6)
pca6.fit(X_pca)
print(pca6.components_)
print(pca6.explained_variance_ratio_)
Xpca6 = pca6.transform(X_pca)
Xpca6
# Components are orthogonal, so the pairplot should show no correlation.
sns.pairplot(pd.DataFrame(Xpca6))
As depicted in the pairplot above, there is no correlation between the independent variables.
In this section we will perform the follwoing steps:-
# Same test_size and random_state as the non-PCA split, so both models are
# evaluated on the same row indices (compare y_train_pca with y_train).
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(Xpca6, y_pca, test_size = 0.3, random_state = 1)
y_train_pca
y_train
Y_train and y_train_pca are pointing to the same index.
Setting random_state = 1 to compute the accuracy scores after PCA transformation on the same set of data points as done earlier without PCA
# Same hyper-parameters as the non-PCA model so the two are comparable.
svm_model_pca = SVC(kernel="rbf", C=30, gamma="auto", degree=3)
svm_model_pca.fit(X_train_pca, y_train_pca)

# Accuracy on both PCA splits.
svm_score_rbf_train_pca = svm_model_pca.score(X_train_pca, y_train_pca)
print('SVM model score (Training Data) with PCA:', svm_score_rbf_train_pca)
svm_score_rbf_pca = svm_model_pca.score(X_test_pca, y_test_pca)
print('SVM model score (Testing Data) with PCA:', svm_score_rbf_pca)

# Classification report + confusion-matrix heatmap for the PCA test split.
svm_pred_pca = svm_model_pca.predict(X_test_pca)
print("\nSVC - Classification Report")
class_report = classification_report(y_test_pca, svm_pred_pca, labels=[0, 1, 2])
print(class_report)
sns.heatmap(confusion_matrix(y_test_pca, svm_pred_pca), annot=True, cmap='Pastel2_r', fmt='g')
# Per-class precision/recall for the PCA model (string keys = labels).
rep = classification_report(y_test_pca, svm_pred_pca, labels=[0,1,2], output_dict=True)
prec_bus_svm_pca = rep['0']['precision']
recall_bus_svm_pca = rep['0']['recall']
prec_car_svm_pca = rep['1']['precision']
recall_car_svm_pca = rep['1']['recall']
prec_van_svm_pca = rep['2']['precision']
recall_van_svm_pca = rep['2']['recall']
# 10-fold cross-validation on the PCA-transformed features.
# Consistency fix: use y_pca (the target aligned with Xpca6) rather than y;
# the two hold the same values, but y belongs to the non-PCA pipeline.
# NOTE(review): scaling and PCA were fit on the full data, so these CV
# scores carry the same mild leakage as the non-PCA cross-validation.
scores = cross_val_score(svm_model_pca, Xpca6, y_pca, cv = 10)
scores
svm_crossval_score_mean_pca = scores.mean()
print('SVM classifier cross val mean score after PCA transformation is:', svm_crossval_score_mean_pca)
svm_crossval_score_std_pca = scores.std()
print('SVM classifier cross val standard deviation after PCA transformation is:', svm_crossval_score_std_pca)
# Side-by-side comparison of test accuracy with and without PCA.
accu_scores = [svm_score_rbf, svm_score_rbf_pca]
legend = ['SVM (without PCA)', 'SVM (with PCA)']
plt.figure(figsize=(5, 2))
plt.title('Accuracy scores comparison')
sns.pointplot(legend, accu_scores, color= 'purple')
# Side-by-side comparison of the 10-fold cross-validation mean scores.
cv_scores = [svm_crossval_score_mean, svm_crossval_score_mean_pca]
legend = ['SVM (without PCA)', 'SVM (with PCA)']
plt.figure(figsize=(5, 2))
plt.title('Cross Val scores comparison')
# Bug fix: this chart previously re-plotted accu_scores instead of the
# cross-validation scores it claims to compare.
sns.pointplot(legend, cv_scores, color= 'green')
# Relative accuracy loss caused by reducing to 6 PCA components.
# (Fixed the misspelled "accurracy"/"dimentions" in the printed message.)
print('The percentage drop in accuracy after reducing dimensions is {} %'
      .format(((svm_score_rbf - svm_score_rbf_pca) / svm_score_rbf) * 100))
As we can see, there is only a 3.3% reduction in the accuracy of the model after transforming through PCA. The positives of using PCA are that we have fewer dimensions, so computation is cheaper, and PCA also reduces overfitting.